Introduction

Basic data visualization of the beatmap info provided by the osu! API (https://github.com/ppy/osu-api/wiki). Only ranked, loved, and qualified maps are used, and the graphs focus on standard mode. (Due to ranked/loved irregularities, a few maps are actually in the graveyard but are returned by the API query anyway, e.g. https://osu.ppy.sh/b/766190&m=2, which has 1 loved CtB diff.)

library(ggplot2)
library(gridExtra)
library(plyr)
library(jsonlite)
library(varhandle)
# Import data from JSON and remove duplicate rows.
# NOTE(review): assumes fromJSON() yields a list of data frames that rbind()
# can stack into a single data frame -- confirm against the maps.json layout.
beatmaps <- unique(do.call("rbind", fromJSON("maps.json")))

# Convert strings of ints and floats to numeric datatypes.
# vapply() pins the result to exactly one logical per column, whereas sapply()
# can silently change its return shape on unusual input (e.g. zero columns).
beatmaps.isintcol <- vapply(
  beatmaps,
  function(col) all(check.numeric(col, only.integer = TRUE)),
  logical(1)
)
# Numeric-but-not-integer columns: everything check.numeric() accepts that was
# not already classified as integer above.
beatmaps.isnumcol <- vapply(
  beatmaps,
  function(col) all(check.numeric(col)),
  logical(1)
) & !beatmaps.isintcol

# List-style column replacement converts each column on its own: no
# intermediate matrix is built and a single selected column is not dropped
# to a bare vector.
beatmaps[beatmaps.isintcol] <- lapply(beatmaps[beatmaps.isintcol], as.integer)
beatmaps[beatmaps.isnumcol] <- lapply(beatmaps[beatmaps.isnumcol], as.numeric)

# Parse the MySQL-style datetime strings into POSIXct values.
# In the "Etc/" zone names the sign is inverted, so "Etc/GMT+4" means UTC-4.
# That offset was read off the new site and may just reflect a local setting,
# so treat the chosen timezone as unverified.
for (date_col in c("approved_date", "last_update")) {
  beatmaps[[date_col]] <- as.POSIXct(beatmaps[[date_col]], tz = "Etc/GMT+4")
}


# Create labels and data frames for each gamemode.
# gamemodes holds the variable names created below; gamemode.labels holds the
# display labels, in the same order.
gamemodes <- c("std", "taiko", "ctb", "mania")
gamemode.labels <- c("Standard", "Taiko", "CtB", "Mania")

# Pin the factor levels explicitly. Without `levels`, factor() pairs the labels
# with whatever distinct values happen to be present, so a data set missing one
# mode would error out (or mislabel) instead of producing an empty level.
# NOTE(review): assumes the API encodes mode as 0..3 in the order of
# gamemode.labels; the guard below fails loudly if that does not hold.
gamemode.codes <- 0:3
stopifnot(all(is.na(beatmaps$mode) | beatmaps$mode %in% gamemode.codes))
beatmaps$mode <- factor(beatmaps$mode, levels = gamemode.codes,
                        labels = gamemode.labels)

# Create one top-level data frame per mode: std, taiko, ctb, mania.
for (i in seq_along(gamemodes)) {
  assign(gamemodes[i], beatmaps[beatmaps$mode == gamemode.labels[i], ])
}

# Shared plot components, defined once and reused by the plots below.
# The axis limits are deliberately tight, so a few outlier maps usually fall
# outside them and are left out of the affected plots.

# Star rating on the x axis: 0-10 stars, ticks every half star, 0.05-star bins.
diff_x_scale <- scale_x_continuous(breaks = seq(0, 10, 0.5), limits = c(0, 10))
diff_hist <- geom_histogram(binwidth = 0.05)

# Length on the x axis: 0-600 s, ticks every 30 s, 1-second bins.
length_x_scale <- scale_x_continuous(breaks = seq(0, 600, 30), limits = c(0, 600))
length_hist <- geom_histogram(binwidth = 1)

# y scales for approach rate (ticks only) and star rating (ticks plus limits).
AR_y_scale <- scale_y_continuous(breaks = 0:10)
SR_y_scale <- scale_y_continuous(breaks = seq(from = 0, to = 10, by = 1), limits = c(0, 10))

# Approval date on the x axis: one tick per year, labelled with the year.
approved_x_scale <- scale_x_datetime(date_labels = "%Y", date_breaks = "1 year")

# Legend titles for plots that map mode to fill or to colour.
legend_title_fill <- labs(fill = "Mode")
legend_title_color <- labs(color = "Mode")

# Center titles
theme_update(plot.title = element_text(hjust = 0.5))

Plots

# Stacked histogram of star rating across all modes, filled by mode.
# `mode` is already a factor, so it maps to a discrete fill scale directly.
ggplot(beatmaps, aes(x = difficultyrating, fill = mode)) +
  diff_hist +
  diff_x_scale +
  legend_title_fill +
  labs(title = "Total Star Rating (All Modes)")
## Warning: Removed 61 rows containing non-finite values (stat_bin).

# Star rating frequency polygons, one line per mode (same 0.05-star bins as
# the histogram version).
ggplot(beatmaps, aes(x = difficultyrating, color = mode)) +
  geom_freqpoly(binwidth = 0.05) +
  diff_x_scale +
  legend_title_color +
  labs(title = "Star Rating (All Modes)")
## Warning: Removed 61 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).

# Star rating histograms, one facet per mode (facets share the y scale).
ggplot(beatmaps, aes(x = difficultyrating)) +
  diff_hist +
  diff_x_scale +
  facet_wrap(~ mode) +
  ggtitle("Star Rating Distributions (All Modes)")
## Warning: Removed 61 rows containing non-finite values (stat_bin).

# Star rating histograms per mode, each with its own y scale.
# Builds one standalone histogram for a single mode's data frame.
sr_histogram <- function(mode_data, mode_title) {
  ggplot(mode_data, aes(x = difficultyrating)) +
    diff_hist +
    diff_x_scale +
    labs(title = mode_title)
}

diffplots0 <- sr_histogram(std, "Standard")
diffplots1 <- sr_histogram(taiko, "Taiko")
diffplots2 <- sr_histogram(ctb, "CtB")
diffplots3 <- sr_histogram(mania, "Mania")

# Lay the four independent plots out on one page under a common heading.
grid.arrange(
  diffplots0, diffplots1, diffplots2, diffplots3,
  top = "Star Rating Distributions (All Modes)"
)
## Warning: Removed 9 rows containing non-finite values (stat_bin).
## Warning: Removed 15 rows containing non-finite values (stat_bin).
## Warning: Removed 24 rows containing non-finite values (stat_bin).
## Warning: Removed 13 rows containing non-finite values (stat_bin).

# Stacked histogram of total beatmap length across all modes, filled by mode.
ggplot(beatmaps, aes(x = total_length, fill = mode)) +
  length_hist +
  length_x_scale +
  legend_title_fill +
  labs(title = "Total Beatmap Length (All Modes)")
## Warning: Removed 117 rows containing non-finite values (stat_bin).

# Total length frequency polygons, one line per mode (1-second bins).
ggplot(beatmaps, aes(x = total_length, color = mode)) +
  geom_freqpoly(binwidth = 1) +
  length_x_scale +
  legend_title_color +
  labs(title = "Beatmap Length (All Modes)")
## Warning: Removed 117 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).

# Total length histograms per mode, each with its own y scale.
# Builds one standalone histogram for a single mode's data frame.
length_histogram <- function(mode_data, mode_title) {
  ggplot(mode_data, aes(x = total_length)) +
    length_hist +
    length_x_scale +
    labs(title = mode_title)
}

lengthplots0 <- length_histogram(std, "Standard")
lengthplots1 <- length_histogram(taiko, "Taiko")
lengthplots2 <- length_histogram(ctb, "CtB")
lengthplots3 <- length_histogram(mania, "Mania")

# Lay the four independent plots out on one page under a common heading.
grid.arrange(
  lengthplots0, lengthplots1, lengthplots2, lengthplots3,
  top = "Beatmap Length Distributions (All Modes)"
)
## Warning: Removed 88 rows containing non-finite values (stat_bin).
## Warning: Removed 16 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing non-finite values (stat_bin).

# Playcount frequency polygons, one line per mode.
# The x axis is capped at one million plays and binned in steps of 5000.
ggplot(beatmaps, aes(x = playcount, color = mode)) +
  geom_freqpoly(binwidth = 5000) +
  scale_x_continuous(limits = c(0, 1e6)) +
  legend_title_color +
  labs(title = "Playcount (All Modes)")
## Warning: Removed 1019 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).

# Stacked histogram of approval date across all modes, filled by mode.
# Datetime bin widths are given in seconds; this is a 30-day bin.
thirty_days_sec <- 30 * 24 * 3600
ggplot(beatmaps, aes(x = approved_date, fill = mode)) +
  geom_histogram(binwidth = thirty_days_sec) +
  approved_x_scale +
  legend_title_fill +
  labs(title = "Total Date Approved (All Modes)")

# Approval date frequency polygons, one line per mode.
# Datetime bin widths are given in seconds; this is a 30-day bin.
thirty_days_sec <- 30 * 24 * 3600
ggplot(beatmaps, aes(x = approved_date, color = mode)) +
  geom_freqpoly(binwidth = thirty_days_sec) +
  approved_x_scale +
  legend_title_color +
  labs(title = "Date Approved (All Modes)")

Tables

# Most frequent artists, titles, sources, and creators (one table each).
# Ten artists with the most beatmaps:
artist_counts <- table(beatmaps$artist)
head(sort(artist_counts, decreasing = TRUE), n = 10)
## 
##    Hatsune Miku          ClariS          KOTOKO        fripSide 
##             653             421             398             362 
##              xi           senya           IOSYS Various Artists 
##             349             339             331             329 
##            LiSA      yanaginagi 
##             323             311
# Ten song titles with the most beatmaps.
title_counts <- sort(table(beatmaps$title), decreasing = TRUE)
head(title_counts, n = 10)
## 
##                                Piano 7K BMS Pack 
##                                              193 
##                                Piano Beatmap Set 
##                                              114 
##                                 Harumachi Clover 
##                                               70 
##                                           Granat 
##                                               65 
##                                   Ai no Scenario 
##                                               62 
## PEPPY FIX TAIKO STAR RATING PLEASE for a happier 
##                                               61 
##                         Tokyo (Innovaderz Remix) 
##                                               61 
##                                            MIIRO 
##                                               58 
##                              Hitorigoto -TV MIX- 
##                                               55 
##                                       Re:TrymenT 
##                                               54
# Ten most common source values. Nothing is filtered here, so an empty source
# string is counted like any other value.
source_counts <- sort(table(beatmaps$source), decreasing = TRUE)
head(source_counts, n = 10)
## 
##                                                                    Touhou 
##                                24652                                 2370 
##                                  BMS                          東方Project 
##                                 1557                                  754 
##        SOUND VOLTEX III GRAVITY WARS                                DJMAX 
##                                  466                                  455 
##                       beatmania IIDX SOUND VOLTEX II -infinite infection- 
##                                  398                                  390 
##                                 osu!                             Vocaloid 
##                                  258                                  242
# Ten creators with the most beatmaps.
creator_counts <- sort(table(beatmaps$creator), decreasing = TRUE)
head(creator_counts, n = 10)
## 
## osuplayer111      Sotarks        DJPop     tutuhaha        ztrot 
##          572          571          563          387          377 
##        Larto        Natsu    Monstrata   Ascendance     pishifat 
##          354          345          331          311          311
# Most favorited mapsets. A mapset is identified by its (creator, artist,
# title) triple; the first beatmap seen for each triple represents the set.
mapset_key <- beatmaps[c("creator", "artist", "title")]
mapsets <- beatmaps[!duplicated(mapset_key), ]

# Rank by favourite count, highest first, and show the top 20.
mapsets_by_fav <- arrange(mapsets, desc(favourite_count))
most_fav <- head(mapsets_by_fav, n = 20)
most_fav[c("favourite_count", "creator", "title")]
##    favourite_count     creator                            title
## 1            13163   W h i t e                          My Love
## 2            10292        Fort                        Highscore
## 3             9763 jonathanlfj                        Tear Rain
## 4             8282  Charles445    Liquid (Paul Rosenthal Remix)
## 5             7604      VINXIS                         No title
## 6             6886       Ekoro           Everything will freeze
## 7             6880       Kuria        Guren no Yumiya (TV Size)
## 8             6606     Doormat              Hitorigoto -TV MIX-
## 9             6534      Awaken                     Toumei Elegy
## 10            6335    Voltaeyx       Mayday (feat. Laura Brehm)
## 11            5510      Takuya                        Pika Girl
## 12            5425     Bearizm Cold Green Eyes ft. Roos Denayer
## 13            5300   Saten-san                  Kokou no Sousei
## 14            4878       gowww                       MATRYOSHKA
## 15            4849      h3k1ru River Flows In You (A Love Note)
## 16            4829   ouranhshc                      Bad Apple!!
## 17            4829 -kevincela-                         Flaklypa
## 18            4770     ktgster                             Lost
## 19            4618  Sekai-nyan              This game (TV Size)
## 20            4591       Kuria                   Answer is Near
# Most played individual beatmaps (difficulties, not whole mapsets):
# rank by playcount, highest first, and show the top 20.
beatmaps_by_plays <- arrange(beatmaps, desc(playcount))
most_played <- head(beatmaps_by_plays, n = 20)
most_played[c("playcount", "creator", "title", "version")]
##    playcount           creator                            title
## 1   21429513         W h i t e                          My Love
## 2   20405157         W h i t e                          My Love
## 3   18188427       jonathanlfj                        Tear Rain
## 4   14209863       Blue Dragon                    The Big Black
## 5   12840626       jonathanlfj                        Tear Rain
## 6   12456963           ktgster                             Lost
## 7   12124613       -kevincela-                         Flaklypa
## 8   10881187        Charles445    Liquid (Paul Rosenthal Remix)
## 9   10878775       Blue Dragon              Can't Defeat Airman
## 10  10611576       -kevincela-                         Flaklypa
## 11   9287944         W h i t e                          My Love
## 12   8065171 Multiple Creators                          Renatus
## 13   8028482       jonathanlfj                        Tear Rain
## 14   7715600           ktgster                             Lost
## 15   7548246               Rue                         Dear You
## 16   7343941           Bearizm Cold Green Eyes ft. Roos Denayer
## 17   7328010            h3k1ru River Flows In You (A Love Note)
## 18   6820213            VINXIS                         No title
## 19   6818829        Charles445    Liquid (Paul Rosenthal Remix)
## 20   6684374           val0108                     Scarlet Rose
##                          version
## 1                           Hard
## 2                         Normal
## 3                         Normal
## 4  WHO'S AFRAID OF THE BIG BLACK
## 5                           Hard
## 6                         Normal
## 7                         Normal
## 8                           Easy
## 9       Holy Shit! It's Airman!!
## 10                          Hard
## 11                        Insane
## 12                        Normal
## 13                        Insane
## 14                          Hard
## 15                      Dear Rue
## 16                        Divine
## 17                     Love Note
## 18                  Light Insane
## 19                        Normal
## 20                    0108 style

Scatterplots

# Approach rate against BPM for standard-mode maps.
# Points are mostly transparent so that dense regions stand out; the x axis
# is limited to 0-500 BPM.
ggplot(std, aes(x = bpm, y = diff_approach)) +
  geom_point(alpha = 0.1) +
  scale_x_continuous(limits = c(0, 500)) +
  AR_y_scale +
  labs(title = "Approach Rate vs BPM")
## Warning: Removed 124 rows containing missing values (geom_point).

# Star rating against total length for standard-mode maps.
ggplot(std, aes(x = total_length, y = difficultyrating)) +
  geom_point(alpha = 0.1) +
  length_x_scale +
  SR_y_scale +
  labs(title = "Star Rating vs Total Length")
## Warning: Removed 97 rows containing missing values (geom_point).

# Max combo against drain time (hit_length) for standard-mode maps.
# The y axis is limited to combos of 0-4000.
ggplot(std, aes(x = hit_length, y = max_combo)) +
  geom_point(alpha = 0.05) +
  length_x_scale +
  scale_y_continuous(limits = c(0, 4000)) +
  labs(title = "Max Combo vs Drain Time")
## Warning: Removed 73 rows containing missing values (geom_point).

# Linear fit of max combo on drain time for standard-mode maps, to quantify
# the relationship seen in the scatterplot (original note: high linear
# correlation, as expected).
combo_fit <- lm(max_combo ~ hit_length, data = std)
summary(combo_fit)
## 
## Call:
## lm(formula = max_combo ~ hit_length, data = std)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3860.7  -134.3   -17.4   120.5 23020.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99.20894    2.51272  -39.48   <2e-16 ***
## hit_length    4.89613    0.01715  285.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 272.8 on 56923 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.5888, Adjusted R-squared:  0.5888 
## F-statistic: 8.152e+04 on 1 and 56923 DF,  p-value: < 2.2e-16
# Favourite count against playcount for standard-mode maps.
# Both axes are capped (one million plays, 1000 favourites).
ggplot(std, aes(x = playcount, y = favourite_count)) +
  geom_point(alpha = 0.05) +
  scale_x_continuous(limits = c(0, 1e6)) +
  scale_y_continuous(limits = c(0, 1000)) +
  labs(title = "Favorite Count vs Playcount")
## Warning: Removed 2579 rows containing missing values (geom_point).

# Playcount against total length for standard-mode maps.
# The y axis is capped at one million plays.
ggplot(std, aes(x = total_length, y = playcount)) +
  geom_point(alpha = 0.1) +
  length_x_scale +
  scale_y_continuous(limits = c(0, 1e6)) +
  labs(title = "Playcount vs Total Length")
## Warning: Removed 1099 rows containing missing values (geom_point).

# Approach rate against approval date for standard-mode maps.
ggplot(std, aes(x = approved_date, y = diff_approach)) +
  geom_point(alpha = 0.05) +
  approved_x_scale +
  AR_y_scale +
  labs(title = "Approach Rate vs Date Approved")

# Star rating against approval date for standard-mode maps.
ggplot(std, aes(x = approved_date, y = difficultyrating)) +
  geom_point(alpha = 0.1) +
  approved_x_scale +
  SR_y_scale +
  labs(title = "Star Rating vs Date Approved")
## Warning: Removed 9 rows containing missing values (geom_point).

Spread info

# Playcount by song time, categorized by spread icon
# https://osu.ppy.sh/help/wiki/Difficulties#star-rating Not sure about values between boundaries
# spread.sr holds the lower star-rating bound of each tier, in the same order
# as spread.names and spread.colors.
spread.sr <- c(0, 1.51, 2.26, 3.76, 5.26, 6.76)
spread.names <- c("Easy", "Normal", "Hard", "Insane", "Expert", "Expert+")
spread.colors <- c("olivedrab3", "paleturquoise", "gold", "hotpink", "purple", "darkgray")

# Assign difficulty rating by spread ranges to spread names.
# cut() needs one more break than there are intervals: the six lower bounds
# alone form only five bins, which would leave every map at or above 6.76
# stars as NA instead of "Expert+". Closing the top bin with Inf gives six
# left-closed intervals, one per name.
spread.breaks <- c(spread.sr, Inf)
beatmaps$spread_name <- spread.names[
  cut(beatmaps$difficultyrating, spread.breaks, right = FALSE, labels = FALSE)
]
std <- beatmaps[beatmaps$mode == "Standard", ]  # update std so it carries spread_name

# One bar chart per spread tier: total playcount of standard-mode maps in each
# 30-second hit-length bin from 0 to 360 s, laid out on a 2x3 grid.
hitlength.bins <- seq(0, 360, 30)
par(mfrow = c(2, 3), mar = c(4, 4, 4, 1), cex.main = 2)
for (i in seq_along(spread.names)) {
  # which() keeps only definite matches; a bare `==` would turn every NA
  # spread_name into an all-NA row in the subset.
  std.spread <- std[which(std$spread_name == spread.names[i]), ]
  playcount.bin.sum <- vapply(
    split(std.spread, cut(std.spread$hit_length, hitlength.bins)),
    # Sum as double: if playcount is stored as integer, a bin total beyond
    # the integer range would make sum() return NA with a warning.
    function(df) sum(as.numeric(df$playcount)),
    numeric(1)
  )

  # width = 30 with space = 0 makes the bars span exactly the bin edges, so
  # the axis can be labelled with the bin boundaries themselves.
  barplot(playcount.bin.sum, space = 0, width = 30,
          xlab = "Hit length (s)", ylab = "Playcount Total",
          main = spread.names[i], col = spread.colors[i], axisnames = FALSE)
  axis(1, at = hitlength.bins)
}

# Same but 150+ hitlength and stacked bars
# Rows of the matrix are spread tiers; columns are 30-second hit-length bins
# between 150 and 360 s, named after each bin's lower edge.
hitlength.bins.150 <- seq(150, 360, 30)
playcount.bin.mat <- matrix(NA_real_,
                            ncol = length(hitlength.bins.150) - 1,
                            nrow = length(spread.names))
colnames(playcount.bin.mat) <- head(hitlength.bins.150, -1)
rownames(playcount.bin.mat) <- spread.names

for (i in seq_len(nrow(playcount.bin.mat))) {
  # which() keeps only definite matches; a bare `==` would turn every NA
  # spread_name into an all-NA row in the subset.
  std.spread <- std[which(std$spread_name == spread.names[i]), ]
  playcount.bin.mat[i, ] <- vapply(
    split(std.spread, cut(std.spread$hit_length, hitlength.bins.150)),
    # Sum as double: if playcount is stored as integer, a bin total beyond
    # the integer range would make sum() return NA with a warning.
    function(df) sum(as.numeric(df$playcount)),
    numeric(1)
  )
}

dev.off()  # Reset par 
## null device 
##           1
# Stacked bars: one column per 30-second hit-length bin, one coloured segment
# per spread tier.
barplot(
  playcount.bin.mat,
  width = 30, space = 0,
  col = spread.colors,
  legend.text = spread.names,
  axisnames = FALSE,
  main = "Total Playcount by Hitlength and Difficulty",
  xlab = "Hitlength (s)", ylab = "Total Playcount"
)
# The bars start at x = 0, so tick positions are shifted down by the first bin
# edge while the tick labels show the real hit lengths.
tick_positions <- hitlength.bins.150 - hitlength.bins.150[1]
axis(side = 1, at = tick_positions, labels = hitlength.bins.150)